{
  "cells": [
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "AxstrLjxEEL2"
      },
      "source": [
        "<img src=\"https://github.com/chdb-io/chdb/raw/main/docs/_static/snake-chdb.png\" height=100>\n",
        "\n",
        "# chDB Colab\n",
        "\n",
        "Compare chDB / ClickHouse and Pandas performance with .Parquet files"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "-kqtWblkoujV"
      },
      "source": [
        "## Setup\n",
        "First we download some files and install the required libraries."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "39OPDvzADNMz",
        "outputId": "3e2286f2-90a7-4188-d7d1-2bf87c904ad9"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Requirement already satisfied: pyarrow in /usr/local/lib/python3.10/dist-packages (9.0.0)\n",
            "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (1.5.3)\n",
            "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.10/dist-packages (from pyarrow) (1.22.4)\n",
            "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2.8.2)\n",
            "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas) (2022.7.1)\n",
            "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas) (1.16.0)\n",
            "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
            "Collecting chdb\n",
            "  Downloading chdb-0.10.2-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (109.0 MB)\n",
            "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m109.0/109.0 MB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
            "\u001b[?25hInstalling collected packages: chdb\n",
            "Successfully installed chdb-0.10.2\n",
            "--2023-06-15 20:34:08--  https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_04.parquet\n",
            "Resolving github.com (github.com)... 140.82.114.3\n",
            "Connecting to github.com (github.com)|140.82.114.3|:443... connected.\n",
            "HTTP request sent, awaiting response... 301 Moved Permanently\n",
            "Location: https://github.com/duckdb/duckdb-data/releases/download/v1.0/taxi_2019_04.parquet [following]\n",
            "--2023-06-15 20:34:09--  https://github.com/duckdb/duckdb-data/releases/download/v1.0/taxi_2019_04.parquet\n",
            "Reusing existing connection to github.com:443.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/263853960/d481ee00-d5d4-11eb-85f7-c0e266116772?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230615T203409Z&X-Amz-Expires=300&X-Amz-Signature=bd3e2e08a6b8793c8775ec6d1e2f3017489e79a882638ccbc0d10c923ffde839&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=263853960&response-content-disposition=attachment%3B%20filename%3Dtaxi_2019_04.parquet&response-content-type=application%2Foctet-stream [following]\n",
            "--2023-06-15 20:34:09--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/263853960/d481ee00-d5d4-11eb-85f7-c0e266116772?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230615T203409Z&X-Amz-Expires=300&X-Amz-Signature=bd3e2e08a6b8793c8775ec6d1e2f3017489e79a882638ccbc0d10c923ffde839&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=263853960&response-content-disposition=attachment%3B%20filename%3Dtaxi_2019_04.parquet&response-content-type=application%2Foctet-stream\n",
            "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...\n",
            "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 127056503 (121M) [application/octet-stream]\n",
            "Saving to: ‘taxi/201904.parquet’\n",
            "\n",
            "taxi/201904.parquet 100%[===================>] 121.17M   147MB/s    in 0.8s    \n",
            "\n",
            "2023-06-15 20:34:10 (147 MB/s) - ‘taxi/201904.parquet’ saved [127056503/127056503]\n",
            "\n",
            "--2023-06-15 20:34:10--  https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_05.parquet\n",
            "Resolving github.com (github.com)... 140.82.114.3\n",
            "Connecting to github.com (github.com)|140.82.114.3|:443... connected.\n",
            "HTTP request sent, awaiting response... 301 Moved Permanently\n",
            "Location: https://github.com/duckdb/duckdb-data/releases/download/v1.0/taxi_2019_05.parquet [following]\n",
            "--2023-06-15 20:34:10--  https://github.com/duckdb/duckdb-data/releases/download/v1.0/taxi_2019_05.parquet\n",
            "Reusing existing connection to github.com:443.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/263853960/f2525180-d5da-11eb-83dc-697493d2d923?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230615T203410Z&X-Amz-Expires=300&X-Amz-Signature=68c61fa3485c42ce34b690014291c94b982852b5f3a4b68fcf40749e63012e1f&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=263853960&response-content-disposition=attachment%3B%20filename%3Dtaxi_2019_05.parquet&response-content-type=application%2Foctet-stream [following]\n",
            "--2023-06-15 20:34:10--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/263853960/f2525180-d5da-11eb-83dc-697493d2d923?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230615T203410Z&X-Amz-Expires=300&X-Amz-Signature=68c61fa3485c42ce34b690014291c94b982852b5f3a4b68fcf40749e63012e1f&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=263853960&response-content-disposition=attachment%3B%20filename%3Dtaxi_2019_05.parquet&response-content-type=application%2Foctet-stream\n",
            "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
            "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.109.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 130095527 (124M) [application/octet-stream]\n",
            "Saving to: ‘taxi/201905.parquet’\n",
            "\n",
            "taxi/201905.parquet 100%[===================>] 124.07M   166MB/s    in 0.7s    \n",
            "\n",
            "2023-06-15 20:34:11 (166 MB/s) - ‘taxi/201905.parquet’ saved [130095527/130095527]\n",
            "\n",
            "--2023-06-15 20:34:11--  https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_06.parquet\n",
            "Resolving github.com (github.com)... 140.82.114.3\n",
            "Connecting to github.com (github.com)|140.82.114.3|:443... connected.\n",
            "HTTP request sent, awaiting response... 301 Moved Permanently\n",
            "Location: https://github.com/duckdb/duckdb-data/releases/download/v1.0/taxi_2019_06.parquet [following]\n",
            "--2023-06-15 20:34:11--  https://github.com/duckdb/duckdb-data/releases/download/v1.0/taxi_2019_06.parquet\n",
            "Reusing existing connection to github.com:443.\n",
            "HTTP request sent, awaiting response... 302 Found\n",
            "Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/263853960/288fd100-d5db-11eb-887a-36389f711f61?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230615T203411Z&X-Amz-Expires=300&X-Amz-Signature=a7c4e5da2c95bb228fd73138971c2759d2f6c17157ccd9a8c50f6d73551befd3&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=263853960&response-content-disposition=attachment%3B%20filename%3Dtaxi_2019_06.parquet&response-content-type=application%2Foctet-stream [following]\n",
            "--2023-06-15 20:34:11--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/263853960/288fd100-d5db-11eb-887a-36389f711f61?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20230615%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20230615T203411Z&X-Amz-Expires=300&X-Amz-Signature=a7c4e5da2c95bb228fd73138971c2759d2f6c17157ccd9a8c50f6d73551befd3&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=263853960&response-content-disposition=attachment%3B%20filename%3Dtaxi_2019_06.parquet&response-content-type=application%2Foctet-stream\n",
            "Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...\n",
            "Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.109.133|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 120790979 (115M) [application/octet-stream]\n",
            "Saving to: ‘taxi/201906.parquet’\n",
            "\n",
            "taxi/201906.parquet 100%[===================>] 115.19M   118MB/s    in 1.0s    \n",
            "\n",
            "2023-06-15 20:34:12 (118 MB/s) - ‘taxi/201906.parquet’ saved [120790979/120790979]\n",
            "\n"
          ]
        }
      ],
      "source": [
        "!pip install pyarrow pandas\n",
        "!pip install chdb --pre --upgrade\n",
        "\n",
        "!mkdir -p taxi\n",
        "!wget https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_04.parquet -O taxi/201904.parquet\n",
        "!wget https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_05.parquet -O taxi/201905.parquet\n",
        "!wget https://github.com/cwida/duckdb-data/releases/download/v1.0/taxi_2019_06.parquet -O taxi/201906.parquet"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 81
        },
        "id": "imYHieGpD7_e",
        "outputId": "9e5f1e88-e366-4629-c25d-94d7f48033fa"
      },
      "outputs": [
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-4c46373d-23d8-4701-af4f-d4e54d2f30bf\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>version()</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>b'22.12.1.1'</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-4c46373d-23d8-4701-af4f-d4e54d2f30bf')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-4c46373d-23d8-4701-af4f-d4e54d2f30bf button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-4c46373d-23d8-4701-af4f-d4e54d2f30bf');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "      version()\n",
              "0  b'22.12.1.1'"
            ]
          },
          "execution_count": 2,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "import pyarrow.parquet as pq\n",
        "import pandas\n",
        "import glob\n",
        "import chdb\n",
        "\n",
        "# no setup for chDB\n",
        "chdb.query(\"SELECT version()\", 'Dataframe')"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "sGtaQW0L11LT"
      },
      "source": [
        "## Reading Multiple Parquet Files\n",
        "\n",
        "chDB can read multiple parquet files using the glob syntax.\n",
        "\n",
        "In Pandas, we need to load the files separately and concatenate them together into a single DataFrame."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 391
        },
        "id": "1_Kl55nvhfPy",
        "outputId": "7f4e1ba6-1980-4d5b-802e-826084f2a058"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 57.9 ms, sys: 20.1 ms, total: 78 ms\n",
            "Wall time: 80.8 ms\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-9d1fae28-eaca-4b64-9826-c0e3cd5ab974\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>vendor_id</th>\n",
              "      <th>pickup_at</th>\n",
              "      <th>dropoff_at</th>\n",
              "      <th>passenger_count</th>\n",
              "      <th>trip_distance</th>\n",
              "      <th>rate_code_id</th>\n",
              "      <th>store_and_fwd_flag</th>\n",
              "      <th>pickup_location_id</th>\n",
              "      <th>dropoff_location_id</th>\n",
              "      <th>payment_type</th>\n",
              "      <th>fare_amount</th>\n",
              "      <th>extra</th>\n",
              "      <th>mta_tax</th>\n",
              "      <th>tip_amount</th>\n",
              "      <th>tolls_amount</th>\n",
              "      <th>improvement_surcharge</th>\n",
              "      <th>total_amount</th>\n",
              "      <th>congestion_surcharge</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:04:09+00:00</td>\n",
              "      <td>2019-04-01 00:06:35+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>0.5</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>239</td>\n",
              "      <td>239</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>4.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>1.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>8.80</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:22:45+00:00</td>\n",
              "      <td>2019-04-01 00:25:43+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>0.7</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>230</td>\n",
              "      <td>100</td>\n",
              "      <td>b'2'</td>\n",
              "      <td>4.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>8.30</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:39:48+00:00</td>\n",
              "      <td>2019-04-01 01:19:39+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>10.9</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>68</td>\n",
              "      <td>127</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>36.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>7.95</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>47.75</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:35:32+00:00</td>\n",
              "      <td>2019-04-01 00:37:11+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>0.2</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>68</td>\n",
              "      <td>68</td>\n",
              "      <td>b'2'</td>\n",
              "      <td>3.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>7.30</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:44:05+00:00</td>\n",
              "      <td>2019-04-01 00:57:58+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>4.8</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>50</td>\n",
              "      <td>42</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>15.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>3.85</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>23.15</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-9d1fae28-eaca-4b64-9826-c0e3cd5ab974')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-9d1fae28-eaca-4b64-9826-c0e3cd5ab974 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-9d1fae28-eaca-4b64-9826-c0e3cd5ab974');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "  vendor_id                 pickup_at                dropoff_at  \\\n",
              "0      b'1' 2019-04-01 00:04:09+00:00 2019-04-01 00:06:35+00:00   \n",
              "1      b'1' 2019-04-01 00:22:45+00:00 2019-04-01 00:25:43+00:00   \n",
              "2      b'1' 2019-04-01 00:39:48+00:00 2019-04-01 01:19:39+00:00   \n",
              "3      b'1' 2019-04-01 00:35:32+00:00 2019-04-01 00:37:11+00:00   \n",
              "4      b'1' 2019-04-01 00:44:05+00:00 2019-04-01 00:57:58+00:00   \n",
              "\n",
              "   passenger_count  trip_distance rate_code_id store_and_fwd_flag  \\\n",
              "0                1            0.5         b'1'               b'N'   \n",
              "1                1            0.7         b'1'               b'N'   \n",
              "2                1           10.9         b'1'               b'N'   \n",
              "3                1            0.2         b'1'               b'N'   \n",
              "4                1            4.8         b'1'               b'N'   \n",
              "\n",
              "   pickup_location_id  dropoff_location_id payment_type  fare_amount  extra  \\\n",
              "0                 239                  239         b'1'          4.0    3.0   \n",
              "1                 230                  100         b'2'          4.5    3.0   \n",
              "2                  68                  127         b'1'         36.0    3.0   \n",
              "3                  68                   68         b'2'          3.5    3.0   \n",
              "4                  50                   42         b'1'         15.5    3.0   \n",
              "\n",
              "   mta_tax  tip_amount  tolls_amount  improvement_surcharge  total_amount  \\\n",
              "0      0.5        1.00           0.0                    0.3          8.80   \n",
              "1      0.5        0.00           0.0                    0.3          8.30   \n",
              "2      0.5        7.95           0.0                    0.3         47.75   \n",
              "3      0.5        0.00           0.0                    0.3          7.30   \n",
              "4      0.5        3.85           0.0                    0.3         23.15   \n",
              "\n",
              "   congestion_surcharge  \n",
              "0                   2.5  \n",
              "1                   2.5  \n",
              "2                   2.5  \n",
              "3                   2.5  \n",
              "4                   2.5  "
            ]
          },
          "execution_count": 3,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "chdb.query('SELECT * FROM file(\"taxi/*.parquet\", Parquet) LIMIT 5', 'Dataframe');\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "iudIWgW-e29W",
        "outputId": "c3bbf335-c476-4d93-b011-abac3604928b"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "  vendor_id           pickup_at          dropoff_at  passenger_count  \\\n",
            "0         1 2019-06-01 00:55:13 2019-06-01 00:56:17                1   \n",
            "1         1 2019-06-01 00:06:31 2019-06-01 00:06:52                1   \n",
            "2         1 2019-06-01 00:17:05 2019-06-01 00:36:38                1   \n",
            "3         1 2019-06-01 00:59:02 2019-06-01 00:59:12                0   \n",
            "4         1 2019-06-01 00:03:25 2019-06-01 00:15:42                1   \n",
            "\n",
            "   trip_distance rate_code_id store_and_fwd_flag  pickup_location_id  \\\n",
            "0            0.0            1                  N                 145   \n",
            "1            0.0            1                  N                 262   \n",
            "2            4.4            1                  N                  74   \n",
            "3            0.8            1                  N                 145   \n",
            "4            1.7            1                  N                 113   \n",
            "\n",
            "   dropoff_location_id payment_type  fare_amount  extra  mta_tax  tip_amount  \\\n",
            "0                  145            2          3.0    0.5      0.5        0.00   \n",
            "1                  263            2          2.5    3.0      0.5        0.00   \n",
            "2                    7            2         17.5    0.5      0.5        0.00   \n",
            "3                  145            2          2.5    1.0      0.5        0.00   \n",
            "4                  148            1          9.5    3.0      0.5        2.65   \n",
            "\n",
            "   tolls_amount  improvement_surcharge  total_amount  congestion_surcharge  \n",
            "0           0.0                    0.3      4.300000                   0.0  \n",
            "1           0.0                    0.3      6.300000                   2.5  \n",
            "2           0.0                    0.3     18.799999                   0.0  \n",
            "3           0.0                    0.3      4.300000                   0.0  \n",
            "4           0.0                    0.3     15.950000                   2.5  \n",
            "CPU times: user 9.94 s, sys: 5.02 s, total: 15 s\n",
            "Wall time: 16.8 s\n"
          ]
        }
      ],
      "source": [
        "%%time\n",
        "df = pandas.concat(\n",
        "\t[pandas.read_parquet(file)\n",
        "\t for file\n",
        "\t in glob.glob('taxi/*.parquet')])\n",
        "print(df.head(5))"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "08pmu4IN2TPA"
      },
      "source": [
        "## Concatenate the three files into a single large file\n",
        "\n",
        "As Pandas does not have native support for reading multiple files, we perform the remaining experiments on a single large file.\n",
        "\n",
        "We use the pyarrow library to concatenate the three files into a single file."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CISvtKWrfgJP"
      },
      "outputs": [],
      "source": [
        "# concatenate all three parquet files into a single file\n",
        "pq.write_table(pq.ParquetDataset('taxi/').read(), 'alltaxi.parquet', row_group_size=100000)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "_ER2RRs24Ikx"
      },
      "source": [
        "### Querying the Single File\n",
        "\n",
        "Now let's query the single file and check the achieved performance."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 393
        },
        "id": "vujLPgUqHztl",
        "outputId": "858128b4-0262-4950-8c68-fe64615cd24e"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 57.2 ms, sys: 12.3 ms, total: 69.5 ms\n",
            "Wall time: 71.6 ms\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-d1787ab6-dec8-4231-af2b-44fe67f4deab\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>vendor_id</th>\n",
              "      <th>pickup_at</th>\n",
              "      <th>dropoff_at</th>\n",
              "      <th>passenger_count</th>\n",
              "      <th>trip_distance</th>\n",
              "      <th>rate_code_id</th>\n",
              "      <th>store_and_fwd_flag</th>\n",
              "      <th>pickup_location_id</th>\n",
              "      <th>dropoff_location_id</th>\n",
              "      <th>payment_type</th>\n",
              "      <th>fare_amount</th>\n",
              "      <th>extra</th>\n",
              "      <th>mta_tax</th>\n",
              "      <th>tip_amount</th>\n",
              "      <th>tolls_amount</th>\n",
              "      <th>improvement_surcharge</th>\n",
              "      <th>total_amount</th>\n",
              "      <th>congestion_surcharge</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:04:09+00:00</td>\n",
              "      <td>2019-04-01 00:06:35+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>0.5</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>239</td>\n",
              "      <td>239</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>4.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>1.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>8.80</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:22:45+00:00</td>\n",
              "      <td>2019-04-01 00:25:43+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>0.7</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>230</td>\n",
              "      <td>100</td>\n",
              "      <td>b'2'</td>\n",
              "      <td>4.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>8.30</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:39:48+00:00</td>\n",
              "      <td>2019-04-01 01:19:39+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>10.9</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>68</td>\n",
              "      <td>127</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>36.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>7.95</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>47.75</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:35:32+00:00</td>\n",
              "      <td>2019-04-01 00:37:11+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>0.2</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>68</td>\n",
              "      <td>68</td>\n",
              "      <td>b'2'</td>\n",
              "      <td>3.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>7.30</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>b'1'</td>\n",
              "      <td>2019-04-01 00:44:05+00:00</td>\n",
              "      <td>2019-04-01 00:57:58+00:00</td>\n",
              "      <td>1</td>\n",
              "      <td>4.8</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>b'N'</td>\n",
              "      <td>50</td>\n",
              "      <td>42</td>\n",
              "      <td>b'1'</td>\n",
              "      <td>15.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>3.85</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>23.15</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-d1787ab6-dec8-4231-af2b-44fe67f4deab')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-d1787ab6-dec8-4231-af2b-44fe67f4deab button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-d1787ab6-dec8-4231-af2b-44fe67f4deab');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "  vendor_id                 pickup_at                dropoff_at  \\\n",
              "0      b'1' 2019-04-01 00:04:09+00:00 2019-04-01 00:06:35+00:00   \n",
              "1      b'1' 2019-04-01 00:22:45+00:00 2019-04-01 00:25:43+00:00   \n",
              "2      b'1' 2019-04-01 00:39:48+00:00 2019-04-01 01:19:39+00:00   \n",
              "3      b'1' 2019-04-01 00:35:32+00:00 2019-04-01 00:37:11+00:00   \n",
              "4      b'1' 2019-04-01 00:44:05+00:00 2019-04-01 00:57:58+00:00   \n",
              "\n",
              "   passenger_count  trip_distance rate_code_id store_and_fwd_flag  \\\n",
              "0                1            0.5         b'1'               b'N'   \n",
              "1                1            0.7         b'1'               b'N'   \n",
              "2                1           10.9         b'1'               b'N'   \n",
              "3                1            0.2         b'1'               b'N'   \n",
              "4                1            4.8         b'1'               b'N'   \n",
              "\n",
              "   pickup_location_id  dropoff_location_id payment_type  fare_amount  extra  \\\n",
              "0                 239                  239         b'1'          4.0    3.0   \n",
              "1                 230                  100         b'2'          4.5    3.0   \n",
              "2                  68                  127         b'1'         36.0    3.0   \n",
              "3                  68                   68         b'2'          3.5    3.0   \n",
              "4                  50                   42         b'1'         15.5    3.0   \n",
              "\n",
              "   mta_tax  tip_amount  tolls_amount  improvement_surcharge  total_amount  \\\n",
              "0      0.5        1.00           0.0                    0.3          8.80   \n",
              "1      0.5        0.00           0.0                    0.3          8.30   \n",
              "2      0.5        7.95           0.0                    0.3         47.75   \n",
              "3      0.5        0.00           0.0                    0.3          7.30   \n",
              "4      0.5        3.85           0.0                    0.3         23.15   \n",
              "\n",
              "   congestion_surcharge  \n",
              "0                   2.5  \n",
              "1                   2.5  \n",
              "2                   2.5  \n",
              "3                   2.5  \n",
              "4                   2.5  "
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# chDB\n",
        "chdb.query('SELECT * FROM file(\"alltaxi.parquet\", Parquet) LIMIT 5', 'Dataframe');\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 393
        },
        "id": "6plLAHdN39y5",
        "outputId": "1c92232a-4383-4d78-f222-3e6179a95336"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 12.3 s, sys: 4.23 s, total: 16.5 s\n",
            "Wall time: 10.8 s\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-ae87d1c4-e006-47fc-ac2c-ae8adeb14ba5\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>vendor_id</th>\n",
              "      <th>pickup_at</th>\n",
              "      <th>dropoff_at</th>\n",
              "      <th>passenger_count</th>\n",
              "      <th>trip_distance</th>\n",
              "      <th>rate_code_id</th>\n",
              "      <th>store_and_fwd_flag</th>\n",
              "      <th>pickup_location_id</th>\n",
              "      <th>dropoff_location_id</th>\n",
              "      <th>payment_type</th>\n",
              "      <th>fare_amount</th>\n",
              "      <th>extra</th>\n",
              "      <th>mta_tax</th>\n",
              "      <th>tip_amount</th>\n",
              "      <th>tolls_amount</th>\n",
              "      <th>improvement_surcharge</th>\n",
              "      <th>total_amount</th>\n",
              "      <th>congestion_surcharge</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>1</td>\n",
              "      <td>2019-04-01 00:04:09</td>\n",
              "      <td>2019-04-01 00:06:35</td>\n",
              "      <td>1</td>\n",
              "      <td>0.5</td>\n",
              "      <td>1</td>\n",
              "      <td>N</td>\n",
              "      <td>239</td>\n",
              "      <td>239</td>\n",
              "      <td>1</td>\n",
              "      <td>4.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>1.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>8.80</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>2019-04-01 00:22:45</td>\n",
              "      <td>2019-04-01 00:25:43</td>\n",
              "      <td>1</td>\n",
              "      <td>0.7</td>\n",
              "      <td>1</td>\n",
              "      <td>N</td>\n",
              "      <td>230</td>\n",
              "      <td>100</td>\n",
              "      <td>2</td>\n",
              "      <td>4.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>8.30</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>2019-04-01 00:39:48</td>\n",
              "      <td>2019-04-01 01:19:39</td>\n",
              "      <td>1</td>\n",
              "      <td>10.9</td>\n",
              "      <td>1</td>\n",
              "      <td>N</td>\n",
              "      <td>68</td>\n",
              "      <td>127</td>\n",
              "      <td>1</td>\n",
              "      <td>36.0</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>7.95</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>47.75</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>1</td>\n",
              "      <td>2019-04-01 00:35:32</td>\n",
              "      <td>2019-04-01 00:37:11</td>\n",
              "      <td>1</td>\n",
              "      <td>0.2</td>\n",
              "      <td>1</td>\n",
              "      <td>N</td>\n",
              "      <td>68</td>\n",
              "      <td>68</td>\n",
              "      <td>2</td>\n",
              "      <td>3.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>0.00</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>7.30</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>1</td>\n",
              "      <td>2019-04-01 00:44:05</td>\n",
              "      <td>2019-04-01 00:57:58</td>\n",
              "      <td>1</td>\n",
              "      <td>4.8</td>\n",
              "      <td>1</td>\n",
              "      <td>N</td>\n",
              "      <td>50</td>\n",
              "      <td>42</td>\n",
              "      <td>1</td>\n",
              "      <td>15.5</td>\n",
              "      <td>3.0</td>\n",
              "      <td>0.5</td>\n",
              "      <td>3.85</td>\n",
              "      <td>0.0</td>\n",
              "      <td>0.3</td>\n",
              "      <td>23.15</td>\n",
              "      <td>2.5</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-ae87d1c4-e006-47fc-ac2c-ae8adeb14ba5')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-ae87d1c4-e006-47fc-ac2c-ae8adeb14ba5 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-ae87d1c4-e006-47fc-ac2c-ae8adeb14ba5');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "  vendor_id           pickup_at          dropoff_at  passenger_count  \\\n",
              "0         1 2019-04-01 00:04:09 2019-04-01 00:06:35                1   \n",
              "1         1 2019-04-01 00:22:45 2019-04-01 00:25:43                1   \n",
              "2         1 2019-04-01 00:39:48 2019-04-01 01:19:39                1   \n",
              "3         1 2019-04-01 00:35:32 2019-04-01 00:37:11                1   \n",
              "4         1 2019-04-01 00:44:05 2019-04-01 00:57:58                1   \n",
              "\n",
              "   trip_distance rate_code_id store_and_fwd_flag  pickup_location_id  \\\n",
              "0            0.5            1                  N                 239   \n",
              "1            0.7            1                  N                 230   \n",
              "2           10.9            1                  N                  68   \n",
              "3            0.2            1                  N                  68   \n",
              "4            4.8            1                  N                  50   \n",
              "\n",
              "   dropoff_location_id payment_type  fare_amount  extra  mta_tax  tip_amount  \\\n",
              "0                  239            1          4.0    3.0      0.5        1.00   \n",
              "1                  100            2          4.5    3.0      0.5        0.00   \n",
              "2                  127            1         36.0    3.0      0.5        7.95   \n",
              "3                   68            2          3.5    3.0      0.5        0.00   \n",
              "4                   42            1         15.5    3.0      0.5        3.85   \n",
              "\n",
              "   tolls_amount  improvement_surcharge  total_amount  congestion_surcharge  \n",
              "0           0.0                    0.3          8.80                   2.5  \n",
              "1           0.0                    0.3          8.30                   2.5  \n",
              "2           0.0                    0.3         47.75                   2.5  \n",
              "3           0.0                    0.3          7.30                   2.5  \n",
              "4           0.0                    0.3         23.15                   2.5  "
            ]
          },
          "execution_count": 7,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "pandas.read_parquet('alltaxi.parquet').head(5)"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "SQZyP6Er6B-8"
      },
      "source": [
        "## Counting the Rows\n",
        "\n",
        "Now suppose we want to figure out how many rows are in our data set. We can do that using the following code snippets.\n",
        "\n",
        "Note that by default Pandas will read the entire Parquet file into memory again. We can manually optimize the query by specifying that only a single column should be loaded.\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 116
        },
        "id": "7usVA6OSlsOE",
        "outputId": "f40f1eed-9f3d-475c-bb18-cc3146e9569a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 182 ms, sys: 83 ms, total: 265 ms\n",
            "Wall time: 255 ms\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-3694cf79-e8fd-4ec5-9019-8902c8dbb927\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>count()</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>21939424</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-3694cf79-e8fd-4ec5-9019-8902c8dbb927')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-3694cf79-e8fd-4ec5-9019-8902c8dbb927 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-3694cf79-e8fd-4ec5-9019-8902c8dbb927');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "    count()\n",
              "0  21939424"
            ]
          },
          "execution_count": 8,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# chDB\n",
        "chdb.query('SELECT count(*) FROM file(\"alltaxi.parquet\", Parquet) LIMIT 5', 'Dataframe');\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "t4-v5dL5H921",
        "outputId": "aa4fa24b-0dd4-4d28-d4e7-a64a4fcb5c39"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "21939424\n",
            "CPU times: user 11.4 s, sys: 5.53 s, total: 16.9 s\n",
            "Wall time: 11.3 s\n"
          ]
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (naive)\n",
        "print(len(pandas.read_parquet('alltaxi.parquet')))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "JwYp_WhvIkW0",
        "outputId": "4ef169c4-7276-455a-e7a7-817319712ded"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "21939424\n",
            "CPU times: user 1.18 s, sys: 473 ms, total: 1.65 s\n",
            "Wall time: 1e+03 ms\n"
          ]
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (projection pushdown)\n",
        "print(len(pandas.read_parquet('alltaxi.parquet', columns=['vendor_id'])))"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "frXCfU-u8p9R"
      },
      "source": [
        "# Filtering Rows\n",
        "It is common to use some sort of filtering predicate to only look at the interesting parts of a data set. For example, imagine we want to know how many taxi rides occur after the 30th of June 2019. We can do that using the following queries in both Pandas and chDB.\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 116
        },
        "id": "TVQpHs4DJHAQ",
        "outputId": "68bacc94-08bc-4d6e-a8da-c71024157249"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 386 ms, sys: 156 ms, total: 543 ms\n",
            "Wall time: 515 ms\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-a2469398-4814-482c-89a6-c072a3e70cc7\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>count()</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>167022</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-a2469398-4814-482c-89a6-c072a3e70cc7')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-a2469398-4814-482c-89a6-c072a3e70cc7 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-a2469398-4814-482c-89a6-c072a3e70cc7');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "   count()\n",
              "0   167022"
            ]
          },
          "execution_count": 11,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# chDB\n",
        "chdb.query(\"SELECT count(*) FROM file('alltaxi.parquet', Parquet)  WHERE pickup_at > '2019-06-30'\", 'Dataframe');"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "nH-nuAfCJTRr",
        "outputId": "c2d71280-205a-499f-9666-26785846ce59"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 8.72 s, sys: 3.88 s, total: 12.6 s\n",
            "Wall time: 8.64 s\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "167022"
            ]
          },
          "execution_count": 12,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (naive)\n",
        "len(pandas.read_parquet('alltaxi.parquet')\n",
        "          .query(\"pickup_at > '2019-06-30'\"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "hUdGqKBTJutf",
        "outputId": "14aa69c0-efb8-4d57-ce7d-c2e2e40b9d27"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 758 ms, sys: 670 ms, total: 1.43 s\n",
            "Wall time: 1.11 s\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "167022"
            ]
          },
          "execution_count": 13,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (projection pushdown)\n",
        "len(pandas.read_parquet('alltaxi.parquet', columns=['pickup_at'])\n",
        "          .query(\"pickup_at > '2019-06-30'\"))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "pDVawGeiCCdR"
      },
      "outputs": [],
      "source": [
        "df = pandas.read_parquet('alltaxi.parquet')"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "td7w1Pi-CGR8",
        "outputId": "db9e4c45-ee59-4dfe-ea54-908048ed42da"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "167022\n",
            "CPU times: user 547 ms, sys: 556 ms, total: 1.1 s\n",
            "Wall time: 883 ms\n"
          ]
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas native\n",
        "print(len(df[['pickup_at']].query(\"pickup_at > '2019-06-30'\")))"
      ]
    },
    {
      "attachments": {},
      "cell_type": "markdown",
      "metadata": {
        "id": "7vfhR9FHDYrp"
      },
      "source": [
        "## Aggregates\n",
        "\n",
        "Now suppose we want to figure out how many rows are in our data set. We can do that using the following code snippets.\n",
        "\n",
        "Note that by default Pandas will read the entire Parquet file into memory again. We can manually optimize the query by specifying that only a single column should be loaded.\n",
        "\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "bPlEXfuIKbf2",
        "outputId": "7d678df4-f3d4-4929-a647-7436e45953b5"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 839 ms, sys: 185 ms, total: 1.02 s\n",
            "Wall time: 1.07 s\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-28bba22b-88f9-4065-bd24-d22d4c0104cb\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>passenger_count</th>\n",
              "      <th>count()</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>408742</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>7</td>\n",
              "      <td>106</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>1</td>\n",
              "      <td>15356631</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>6</td>\n",
              "      <td>546467</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>9</td>\n",
              "      <td>64</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>2</td>\n",
              "      <td>3332927</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>5</td>\n",
              "      <td>910516</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>8</td>\n",
              "      <td>72</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>3</td>\n",
              "      <td>944833</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>4</td>\n",
              "      <td>439066</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-28bba22b-88f9-4065-bd24-d22d4c0104cb')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-28bba22b-88f9-4065-bd24-d22d4c0104cb button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-28bba22b-88f9-4065-bd24-d22d4c0104cb');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "   passenger_count   count()\n",
              "0                0    408742\n",
              "1                7       106\n",
              "2                1  15356631\n",
              "3                6    546467\n",
              "4                9        64\n",
              "5                2   3332927\n",
              "6                5    910516\n",
              "7                8        72\n",
              "8                3    944833\n",
              "9                4    439066"
            ]
          },
          "execution_count": 16,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# chDB (SQL)\n",
        "chdb.query(\"SELECT passenger_count, count(*) FROM file('alltaxi.parquet', Parquet) GROUP BY passenger_count\", 'Dataframe');"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "xchbsEwoL5ll",
        "outputId": "7ec49870-df0d-4a85-cd23-9309eb645a7a"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 8.67 s, sys: 4.34 s, total: 13 s\n",
            "Wall time: 8.37 s\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-f4c50dee-dab7-4c8e-9859-d0ee9d9ef00e\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>passenger_count</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>passenger_count</th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>408742</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>15356631</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3332927</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>944833</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>439066</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>910516</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>546467</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>106</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>72</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>64</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f4c50dee-dab7-4c8e-9859-d0ee9d9ef00e')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-f4c50dee-dab7-4c8e-9859-d0ee9d9ef00e button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-f4c50dee-dab7-4c8e-9859-d0ee9d9ef00e');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "                 passenger_count\n",
              "passenger_count                 \n",
              "0                         408742\n",
              "1                       15356631\n",
              "2                        3332927\n",
              "3                         944833\n",
              "4                         439066\n",
              "5                         910516\n",
              "6                         546467\n",
              "7                            106\n",
              "8                             72\n",
              "9                             64"
            ]
          },
          "execution_count": 17,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (naive)\n",
        "pandas.read_parquet('alltaxi.parquet').groupby('passenger_count').agg({'passenger_count' : 'count'})"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "K8WEZBEYMEja",
        "outputId": "1500dd57-eecb-4e0c-85e6-c569b7eb539d"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 533 ms, sys: 179 ms, total: 712 ms\n",
            "Wall time: 529 ms\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-37485f54-be21-4984-b027-8e1c16b7488a\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>passenger_count</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>passenger_count</th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>408742</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>15356631</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3332927</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>944833</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>439066</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>910516</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>546467</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>106</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>72</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>64</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-37485f54-be21-4984-b027-8e1c16b7488a')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-37485f54-be21-4984-b027-8e1c16b7488a button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-37485f54-be21-4984-b027-8e1c16b7488a');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "                 passenger_count\n",
              "passenger_count                 \n",
              "0                         408742\n",
              "1                       15356631\n",
              "2                        3332927\n",
              "3                         944833\n",
              "4                         439066\n",
              "5                         910516\n",
              "6                         546467\n",
              "7                            106\n",
              "8                             72\n",
              "9                             64"
            ]
          },
          "execution_count": 18,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (projection pushdown)\n",
        "pandas.read_parquet('alltaxi.parquet', columns=['passenger_count']).groupby('passenger_count').agg({'passenger_count' : 'count'})"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "cTdx54ecw7x9",
        "outputId": "b8ccf54e-f5c3-4602-9be3-14b770e8dfe9"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "CPU times: user 334 ms, sys: 87.8 ms, total: 421 ms\n",
            "Wall time: 547 ms\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "  <div id=\"df-f823abea-7f39-40f4-b9bb-aa65faf44094\">\n",
              "    <div class=\"colab-df-container\">\n",
              "      <div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>passenger_count</th>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>passenger_count</th>\n",
              "      <th></th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>408742</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>15356631</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>3332927</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>944833</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>439066</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>910516</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>546467</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>106</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>72</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>64</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>\n",
              "      <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-f823abea-7f39-40f4-b9bb-aa65faf44094')\"\n",
              "              title=\"Convert this dataframe to an interactive table.\"\n",
              "              style=\"display:none;\">\n",
              "        \n",
              "  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n",
              "       width=\"24px\">\n",
              "    <path d=\"M0 0h24v24H0V0z\" fill=\"none\"/>\n",
              "    <path d=\"M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z\"/><path d=\"M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z\"/>\n",
              "  </svg>\n",
              "      </button>\n",
              "      \n",
              "  <style>\n",
              "    .colab-df-container {\n",
              "      display:flex;\n",
              "      flex-wrap:wrap;\n",
              "      gap: 12px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert {\n",
              "      background-color: #E8F0FE;\n",
              "      border: none;\n",
              "      border-radius: 50%;\n",
              "      cursor: pointer;\n",
              "      display: none;\n",
              "      fill: #1967D2;\n",
              "      height: 32px;\n",
              "      padding: 0 0 0 0;\n",
              "      width: 32px;\n",
              "    }\n",
              "\n",
              "    .colab-df-convert:hover {\n",
              "      background-color: #E2EBFA;\n",
              "      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n",
              "      fill: #174EA6;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert {\n",
              "      background-color: #3B4455;\n",
              "      fill: #D2E3FC;\n",
              "    }\n",
              "\n",
              "    [theme=dark] .colab-df-convert:hover {\n",
              "      background-color: #434B5C;\n",
              "      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n",
              "      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n",
              "      fill: #FFFFFF;\n",
              "    }\n",
              "  </style>\n",
              "\n",
              "      <script>\n",
              "        const buttonEl =\n",
              "          document.querySelector('#df-f823abea-7f39-40f4-b9bb-aa65faf44094 button.colab-df-convert');\n",
              "        buttonEl.style.display =\n",
              "          google.colab.kernel.accessAllowed ? 'block' : 'none';\n",
              "\n",
              "        async function convertToInteractive(key) {\n",
              "          const element = document.querySelector('#df-f823abea-7f39-40f4-b9bb-aa65faf44094');\n",
              "          const dataTable =\n",
              "            await google.colab.kernel.invokeFunction('convertToInteractive',\n",
              "                                                     [key], {});\n",
              "          if (!dataTable) return;\n",
              "\n",
              "          const docLinkHtml = 'Like what you see? Visit the ' +\n",
              "            '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n",
              "            + ' to learn more about interactive tables.';\n",
              "          element.innerHTML = '';\n",
              "          dataTable['output_type'] = 'display_data';\n",
              "          await google.colab.output.renderOutput(dataTable, element);\n",
              "          const docLink = document.createElement('div');\n",
              "          docLink.innerHTML = docLinkHtml;\n",
              "          element.appendChild(docLink);\n",
              "        }\n",
              "      </script>\n",
              "    </div>\n",
              "  </div>\n",
              "  "
            ],
            "text/plain": [
              "                 passenger_count\n",
              "passenger_count                 \n",
              "0                         408742\n",
              "1                       15356631\n",
              "2                        3332927\n",
              "3                         944833\n",
              "4                         439066\n",
              "5                         910516\n",
              "6                         546467\n",
              "7                            106\n",
              "8                             72\n",
              "9                             64"
            ]
          },
          "execution_count": 19,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "%%time\n",
        "# Pandas (native)\n",
        "df[['passenger_count']].groupby('passenger_count').agg({'passenger_count' : 'count'})"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}
